Variables:
Risk
Money
Security
Good time Help Success Proper Environment Tradition Creativity
Friends important Family important Leisure time Happiness Health (subjective) Satisfaction Freedom
Sex Age Country Wave Marital status Children Employment Education
library(data.table)
library(tidyr)
# Wave 5: load the serialized survey object from disk.
WV5_data <- readRDS(
  file = "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/F00007944-WV5_Data_R_v20180912.rds"
)
# Coerce the loaded object to a plain data.frame for the steps that follow.
WV5_data_df <- as.data.frame(x = WV5_data)
# Preview the leading rows of the first five columns.
head(WV5_data_df[, seq_len(5)])
library(dplyr)
# Give the Wave 5 survey items descriptive names (new_name = old_name).
WV5_data <- rename(
  WV5_data_df,
  sex = V235,
  age = V237,
  country = V2,
  wave = V1,
  family_important = V4,
  friends_important = V5,
  leisure_time = V6,
  happiness = V10,
  health = V11,
  satisfaction = V22,
  freedom = V46,
  marital_status = V55,
  children = V56,
  creativity = V80,
  money = V81,
  security = V82,
  goodtime = V83,
  help = V84,
  success = V85,
  risk = V86,
  proper = V87,
  environment = V88,
  tradition = V89,
  employment = V241,
  education = V238
)
WV5_data
# Keep only the variables of interest, in this column order.
WV5_data <- select(
  WV5_data,
  sex, age, country, wave,
  family_important, friends_important, leisure_time,
  happiness, health, satisfaction, freedom,
  marital_status, children,
  creativity, money, security, goodtime, help, success, risk,
  proper, environment, tradition,
  employment, education
)
WV5_data
# Translate the numeric country codes of Wave 5 into country names.
# The lookup file has no header row; its two columns are named here.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
# match() gives, for each respondent, the row of the lookup table holding
# that respondent's country code (NA when the code is not listed).
WV5_data[["country_lab"]] <-
  countrynames[["name"]][match(WV5_data[["country"]], countrynames[["code"]])]
# Number of respondents per country label.
table(WV5_data[["country_lab"]])
Andorra Argentina Australia Brazil Bulgaria Burkina Faso Canada Chile
1003 1002 1421 1500 1001 1534 2164 1000
China Colombia Cyprus (G) Egypt Ethiopia Finland France Georgia
1991 3025 1050 3051 1500 1014 1001 1500
Germany Ghana Great Britain Guatemala Hong Kong Hungary India Indonesia
2064 1534 1041 1000 1252 1007 2001 2015
Iran Iraq Italy Japan Jordan Malaysia Mali Mexico
2667 2701 1012 1096 1200 1201 1534 1560
Moldova Morocco Netherlands New Zealand Norway Peru Poland Romania
1046 1200 1050 954 1025 1500 1000 1776
Russia Rwanda Slovenia South Africa South Korea Spain Sweden Switzerland
2033 1507 1037 2988 1200 1200 1003 1241
Taiwan Thailand Trinidad and Tobago Turkey Ukraine United States Uruguay Viet Nam
1227 1534 1002 1346 1000 1249 1000 1495
Zambia
1500
# Print the Wave 5 data, which now also carries the country_lab column.
WV5_data
NA
NA
#Read Dataset (Wave 6)
# load() restores the saved objects into the workspace and returns only their
# names as a character vector, so its return value is not the data itself.
# Keep the names just to verify that the expected object was restored.
wv6_loaded <- load("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/WV6_Data_R_v20201117.rdata")
stopifnot("WV6_Data_R_v20201117" %in% wv6_loaded)
# Work on the restored data under the shorter name used in the rest of the script.
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)
```{r}
#rename variables
# Give the Wave 6 survey items the same descriptive names used for Wave 5
# (new_name = old_name).
WV6_data <- rename(
  WV6_data,
  wave = V1,
  sex = V240,
  age = V242,
  country = V2,
  marital_status = V57,
  children = V58,
  employment = V229,
  education = V248,
  risk = V76,
  money = V71,
  security = V72,
  goodtime = V73,
  help = V74B,
  success = V75,
  proper = V77,
  environment = V78,
  tradition = V79,
  creativity = V70,
  family_important = V4,
  friends_important = V5,
  leisure_time = V6,
  happiness = V10,
  health = V11,
  satisfaction = V23,
  freedom = V55
)
# Keep only the variables of interest, in this column order.
WV6_data <- select(
  WV6_data,
  sex, age, country, wave,
  marital_status, children, employment, education,
  risk, money, security, goodtime, help, success,
  proper, environment, tradition, creativity,
  family_important, friends_important, leisure_time,
  happiness, health, satisfaction, freedom
)
WV6_data
NA
# Decode the dataset (Wave 6): translate numeric country codes into names.
# The lookup file has no header row; its two columns are named here.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
# match() gives, for each respondent, the row of the lookup table holding
# that respondent's country code (NA when the code is not listed).
WV6_data[["country_lab"]] <-
  countrynames[["name"]][match(WV6_data[["country"]], countrynames[["code"]])]
# Number of respondents per country label.
table(WV6_data[["country_lab"]])
Algeria Argentina Armenia Australia Azerbaijan Belarus Brazil Chile
1200 1030 1100 1477 1002 1535 1486 1000
China Colombia Cyprus (G) Ecuador Egypt Estonia Georgia Germany
2300 1512 1000 1202 1523 1533 1202 2046
Ghana Haiti Hong Kong India Iraq Japan Jordan Kazakhstan
1552 1996 1000 4078 1200 2443 1200 1500
Kuwait Kyrgyzstan Lebanon Libya Malaysia Mexico Morocco Netherlands
1303 1500 1200 2131 1300 2000 1200 1902
New Zealand Nigeria Pakistan Palestine Peru Philippines Poland Qatar
841 1759 1200 1000 1210 1200 966 1060
Romania Russia Rwanda Singapore Slovenia South Africa South Korea Spain
1503 2500 1527 1972 1069 3531 1200 1189
Sweden Taiwan Thailand Trinidad and Tobago Tunisia Turkey Ukraine United States
1206 1238 1200 999 1205 1605 1500 2232
Uruguay Uzbekistan Yemen Zimbabwe
1000 1500 1000 1500
WV6_data
# Stack the two waves (Wave 5 first, then Wave 6) into one data set.
WV5_data
WV6_data
data <- do.call(rbind, list(WV5_data, WV6_data))
data
# How many distinct country labels are present in the combined data.
length(unique(data[["country_lab"]]))
[1] 80
# Number of respondents = number of rows in the combined data.
dim(data)[1L]
[1] 173540
# Exclusion of participants: keep a row only when risk, sex, age, education,
# employment and marital_status are all strictly positive and children is
# non-negative. Rows where the condition evaluates to NA are dropped as well.
data <- data[
  which(with(
    data,
    risk > 0 & sex > 0 & age > 0 & education > 0 & employment > 0 &
      marital_status > 0 & children >= 0
  )),
  ,
  drop = FALSE
]
data
NA
# Respondents per sex code (1 = males; 2 = females).
table(data[["sex"]])
1 2
71689 77937
# Categorical age variable: left-closed bands [20,30), [30,40), ..., with
# everything below 20 labelled "15-19" and everything from 80 up labelled "80+".
data$agecat <- as.character(cut(
  data$age,
  breaks = c(-Inf, 20, 30, 40, 50, 60, 70, 80, Inf),
  labels = c("15-19", "20-29", "30-39", "40-49",
             "50-59", "60-69", "70-79", "80+"),
  right = FALSE
))
# Gender variable: replace code 1 with "male" and code 2 with "female".
# Both masks are computed from the original codes before any replacement.
sex_code <- data$sex
sex_label <- replace(sex_code, sex_code == 1, "male")
sex_label <- replace(sex_label, sex_code == 2, "female")
data$sex <- sex_label
# Average age of participants.
mean(data$age)
[1] 41.59569
# Youngest and oldest age among the remaining respondents.
range(data[["age"]])
[1] 15 99
#risk taking Frequency
library(ggplot2)
# Distribution of the risk item.
# NOTE(review): risk appears to take whole-number values (the printed summary
# shows integer quartiles from 1 to 6), so the bins are one unit wide;
# a half-unit bin width would leave every other bin empty.
ggplot(data, aes(x = risk)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "black") +
  labs(x = "Risk Taking", y = "Frequency", title = "Histogram of Risk Taking") +
  theme_minimal()
#age frequency
# One bin per year of age, for the same reason as above.
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "black") +
  labs(x = "Age", y = "Frequency", title = "Histogram of Age Distribution") +
  theme_minimal()
#age vs risk taking
# One box per age band (agecat) showing the spread of the risk item.
ggplot(data, aes(x = agecat, y = risk)) +
  geom_boxplot() +
  labs(title = "Boxplot of Risk and Adventure by Age",
       x = "Age",
       y = "Risk and Adventure") +
  theme_minimal()
NA
NA
# Risk item by sex: one box per sex label.
ggplot(data = data, mapping = aes(x = as.factor(sex), y = risk)) +
  geom_boxplot()
# Column-by-column overview of the filtered data (including NA counts).
summary(object = data)
sex age country wave family_important friends_important leisure_time happiness health satisfaction
Length:149626 Min. :15.0 Min. : 12.0 Min. :5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000
Class :character 1st Qu.:28.0 1st Qu.:276.0 1st Qu.:5.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 5.000
Mode :character Median :39.0 Median :484.0 Median :6.000 Median : 1.000 Median : 2.000 Median : 2.000 Median : 2.000 Median : 2.000 Median : 7.000
Mean :41.6 Mean :481.5 Mean :5.552 Mean : 1.094 Mean : 1.661 Mean : 1.871 Mean : 1.865 Mean : 2.106 Mean : 6.755
3rd Qu.:53.0 3rd Qu.:710.0 3rd Qu.:6.000 3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 3.000 3rd Qu.: 8.000
Max. :99.0 Max. :894.0 Max. :6.000 Max. : 4.000 Max. : 4.000 Max. : 4.000 Max. : 4.000 Max. : 5.000 Max. :10.000
NA's :221 NA's :351 NA's :698 NA's :573 NA's :230 NA's :340
freedom marital_status children creativity money security goodtime help success risk
Min. :-5.000 Min. :1.000 Min. :0.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.00 Min. :-5.000 Min. :1.000
1st Qu.: 6.000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.: 2.000 1st Qu.:3.000
Median : 7.000 Median :1.000 Median :2.000 Median : 3.000 Median : 4.000 Median : 2.000 Median : 3.000 Median : 2.00 Median : 3.000 Median :4.000
Mean : 7.004 Mean :2.715 Mean :1.843 Mean : 2.718 Mean : 3.846 Mean : 2.374 Mean : 3.273 Mean : 2.29 Mean : 2.951 Mean :3.801
3rd Qu.: 9.000 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 5.000 3rd Qu.: 3.00 3rd Qu.: 4.000 3rd Qu.:5.000
Max. :10.000 Max. :6.000 Max. :8.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.00 Max. : 6.000 Max. :6.000
NA's :838 NA's :972 NA's :602 NA's :442 NA's :566 NA's :44862 NA's :703
proper environment tradition employment education country_lab agecat
Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :1.000 Min. :1.000 Length:149626 Length:149626
1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.:3.000 Class :character Class :character
Median : 2.000 Median : 2.000 Median : 2.000 Median :3.000 Median :5.000 Mode :character Mode :character
Mean : 2.533 Mean : 2.468 Mean : 2.511 Mean :3.406 Mean :5.501
3rd Qu.: 3.000 3rd Qu.: 3.000 3rd Qu.: 3.000 3rd Qu.:5.000 3rd Qu.:7.000
Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. :8.000 Max. :9.000
NA's :541 NA's :561 NA's :518
# Drop every row that has an NA in at least one column (listwise deletion).
# NOTE(review): the summary printed just before this step reports 44862 NAs
# in `help`, far more than in any other column, so most of the rows removed
# here are probably removed because of that single variable -- confirm that
# this is intended for analyses that do not use `help`.
data = na.omit(data)
# Re-check the column summaries on the complete-case data.
summary(data)
sex age country wave family_important friends_important leisure_time happiness health satisfaction
Length:101172 Min. :15.00 Min. : 12.0 Min. :5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000
Class :character 1st Qu.:27.00 1st Qu.:268.0 1st Qu.:5.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 5.000
Mode :character Median :39.00 Median :458.0 Median :5.000 Median : 1.000 Median : 2.000 Median : 2.000 Median : 2.000 Median : 2.000 Median : 7.000
Mean :41.11 Mean :474.4 Mean :5.348 Mean : 1.099 Mean : 1.652 Mean : 1.893 Mean : 1.889 Mean : 2.098 Mean : 6.692
3rd Qu.:53.00 3rd Qu.:710.0 3rd Qu.:6.000 3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 3.000 3rd Qu.: 8.000
Max. :99.00 Max. :894.0 Max. :6.000 Max. : 4.000 Max. : 4.000 Max. : 4.000 Max. : 4.000 Max. : 5.000 Max. :10.000
freedom marital_status children creativity money security goodtime help success risk
Min. :-5.00 Min. :1.000 Min. :0.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :1.000
1st Qu.: 5.00 1st Qu.:1.000 1st Qu.:0.000 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.:3.000
Median : 7.00 Median :1.000 Median :2.000 Median : 2.000 Median : 4.000 Median : 2.000 Median : 3.000 Median : 2.000 Median : 3.000 Median :4.000
Mean : 6.91 Mean :2.769 Mean :1.835 Mean : 2.699 Mean : 3.842 Mean : 2.363 Mean : 3.243 Mean : 2.281 Mean : 2.937 Mean :3.827
3rd Qu.: 9.00 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 4.000 3rd Qu.:5.000
Max. :10.00 Max. :6.000 Max. :8.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. :6.000
proper environment tradition employment education country_lab agecat
Min. :-5.000 Min. :-5.000 Min. :-5.00 Min. :1.000 Min. :1.000 Length:101172 Length:101172
1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.:1.000 1st Qu.:3.000 Class :character Class :character
Median : 2.000 Median : 2.000 Median : 2.00 Median :3.000 Median :5.000 Mode :character Mode :character
Mean : 2.538 Mean : 2.452 Mean : 2.51 Mean :3.467 Mean :5.309
3rd Qu.: 3.000 3rd Qu.: 3.000 3rd Qu.: 3.00 3rd Qu.:5.000 3rd Qu.:7.000
Max. : 6.000 Max. : 6.000 Max. : 6.00 Max. :8.000 Max. :9.000
# risk vs education
# The outcome (risk) goes on the y-axis and the predictor (education) on the
# x-axis, so the line drawn by geom_smooth(method = "lm") is the same
# regression of risk on education that is fitted with lm() below.
ggplot(data, aes(education, risk)) +
  geom_point() +
  geom_smooth(method = "lm")
# Simple linear regression of risk on education.
model <- lm(risk ~ education, data = data)
summary(model)
Call:
lm(formula = risk ~ education, data = data)
Residuals:
Min 1Q Median 3Q Max
-3.0532 -1.0532 0.1564 1.2612 2.3660
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.10560 0.01183 347.08 <2e-16 ***
education -0.05240 0.00202 -25.95 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.589 on 101170 degrees of freedom
Multiple R-squared: 0.00661, Adjusted R-squared: 0.0066
F-statistic: 673.1 on 1 and 101170 DF, p-value: < 2.2e-16
# risk vs freedom
# The exclusion step earlier in this script screens out non-positive codes
# only for risk, sex, age, education, employment, marital_status and children;
# freedom still has a minimum of -5 in the printed summary. Treat those
# non-positive codes as invalid here as well, so they do not enter the plot
# or the regression as if they were real scale values.
# (Any model output pasted from an earlier run must be regenerated.)
freedom_data <- subset(data, freedom > 0)
# Outcome (risk) on the y-axis so the smoothed line matches lm() below.
ggplot(freedom_data, aes(freedom, risk)) +
  geom_point() +
  geom_smooth(method = "lm")
# Simple linear regression of risk on freedom, valid freedom answers only.
model1 <- lm(risk ~ freedom, data = freedom_data)
summary(model1)
Call:
lm(formula = risk ~ freedom, data = data)
Residuals:
Min 1Q Median 3Q Max
-3.3968 -1.1100 0.1769 1.2247 2.3204
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.157773 0.014987 277.43 <2e-16 ***
freedom -0.047814 0.002045 -23.38 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.59 on 101170 degrees of freedom
Multiple R-squared: 0.005375, Adjusted R-squared: 0.005365
F-statistic: 546.7 on 1 and 101170 DF, p-value: < 2.2e-16
# Risk item by survey wave: one box per wave.
ggplot(data = data, mapping = aes(x = as.factor(wave), y = risk)) +
  geom_boxplot()
# Scatter of risk (x) against age (y) with a fitted straight line.
ggplot(data = data, mapping = aes(x = risk, y = age)) +
  geom_point() +
  geom_smooth(method = "lm")
```